In [1]:
# Importing necessary libraries
from googleapiclient.discovery import build # To interact with the YouTube Data API
from youtube_transcript_api import YouTubeTranscriptApi
from collections import Counter
import pandas as pd # For creating and handling DataFrames
import isodate # To parse ISO 8601 durations
from collections import Counter # To count occurrences of keywords
import matplotlib.pyplot as plt # For visualizing results
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from transformers import pipeline
In [2]:
# YouTube API Key
API_KEY = 'AIzaSyCUe0mpORyGYYpF-5_XnnoL3Z25gFp_wc8'
In [3]:
# Initialize the YouTube API client
youtube = build('youtube', 'v3', developerKey=API_KEY)
In [8]:
# Function to search and fetch videos
def fetch_youtube_videos(query, max_results=100):
video_data = []
next_page_token = None
fetched_results = 0
while fetched_results < max_results:
results_to_fetch = min(max_results - fetched_results, 50)
# API Request
request = youtube.search().list(
q=query,
part="snippet",
type="video",
maxResults=results_to_fetch,
pageToken=next_page_token,
)
response = request.execute()
for item in response["items"]:
video_data.append({
"video_id": item["id"]["videoId"],
"title": item["snippet"]["title"],
"description": item["snippet"]["description"],
"channel_name": item["snippet"]["channelTitle"],
"channel_link": f"https://www.youtube.com/channel/{item['snippet']['channelId']}",
"publish_date": item["snippet"]["publishedAt"],
})
fetched_results += len(response["items"])
next_page_token = response.get("nextPageToken", None)
if not next_page_token:
break
return video_data
# Function to fetch additional video details
def fetch_video_details(video_ids):
video_details = []
for i in range(0, len(video_ids), 50):
chunk = video_ids[i : i + 50]
request = youtube.videos().list(
part="contentDetails,statistics", id=",".join(chunk)
)
response = request.execute()
for item in response["items"]:
duration_seconds = isodate.parse_duration(
item["contentDetails"]["duration"]
).total_seconds()
views = int(item["statistics"].get("viewCount", 0))
video_details.append({
"video_id": item["id"],
"duration": duration_seconds,
"views": views,
"likes": int(item["statistics"].get("likeCount", 0)),
"comments": int(item["statistics"].get("commentCount", 0)),
"dislikes": int(item["statistics"].get("dislikeCount", 0)) if "dislikeCount" in item["statistics"] else 0,
"valid": duration_seconds < 300 and views >= 1000, # Mark valid videos
})
return video_details
# Function to fetch transcripts
def fetch_transcripts(video_ids):
transcripts = {}
for video_id in video_ids:
try:
transcript = YouTubeTranscriptApi.get_transcript(video_id)
transcript_text = " ".join([t["text"] for t in transcript])
transcripts[video_id] = transcript_text
except Exception:
transcripts[video_id] = "Transcript not available"
return transcripts
# Fetch videos based on a search query
query = "Tinnitus UK"
raw_video_data = fetch_youtube_videos(query, max_results=300) # Fetch more data upfront
# Extract video IDs for further details
video_ids = [video["video_id"] for video in raw_video_data]
# Fetch additional details for the videos
detailed_video_data = fetch_video_details(video_ids)
# Separate valid and backup videos
valid_videos = [video for video in detailed_video_data if video["valid"]]
backup_videos = [video for video in detailed_video_data if not video["valid"]]
# Fetch transcripts for all valid and backup videos
valid_video_ids = [video["video_id"] for video in valid_videos]
backup_video_ids = [video["video_id"] for video in backup_videos]
all_transcripts = fetch_transcripts(valid_video_ids + backup_video_ids)
# Create DataFrames
video_df = pd.DataFrame(raw_video_data)
details_df = pd.DataFrame(valid_videos + backup_videos) # Combine valid and backup videos
final_df = pd.merge(video_df, details_df, on="video_id")
# Add transcripts to the DataFrame
final_df["transcript"] = final_df["video_id"].map(all_transcripts)
# Filter to ensure at least 100 rows
valid_final_df = final_df[final_df["valid"]].head(100) # Use only valid rows first
if len(valid_final_df) < 100:
backup_needed = 100 - len(valid_final_df)
backup_final_df = final_df[~final_df["valid"]].head(backup_needed)
final_df = pd.concat([valid_final_df, backup_final_df])
else:
final_df = valid_final_df
# Display the final DataFrame
from IPython.display import display
display(final_df)
| video_id | title | description | channel_name | channel_link | publish_date | duration | views | likes | comments | dislikes | valid | transcript | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ofg_TGOY5y8 | British Tinnitus Association presents #ThisIsM... | For people living with tinnitus, there is no s... | Tinnitus UK | https://www.youtube.com/channel/UC3ktV24vBKJP3... | 2021-02-01T08:21:35Z | 35.0 | 72781 | 101 | 0 | 0 | True | Transcript not available |
| 1 | YyT9ZwWy5Jc | One Move for Instant Tinnitus Relief #Shorts | Dr. Rowe shows an easy exercise that can give ... | SpineCare Decompression and Chiropractic Center | https://www.youtube.com/channel/UC2lvPVVm0Mv53... | 2023-02-12T14:30:09Z | 41.0 | 1016121 | 35167 | 1557 | 0 | True | Here's one move for instant tinnitus, or tin-N... |
| 2 | 5IJwmPGY92c | Tinnitus sufferers at risk of isolation say ch... | Subscribe to 5 News: http://bit.ly/5NewsSub ▻ ... | 5 News | https://www.youtube.com/channel/UCsAKRVq2n1vcH... | 2019-02-04T21:00:00Z | 142.0 | 1076 | 0 | 4 | 0 | True | imagine trying to carry out everyday tasks wit... |
| 5 | 0GLJvPp5Xc8 | Where does the cure for tinnitus lie? | Leading experts discuss where they think a cur... | Tinnitus UK | https://www.youtube.com/channel/UC3ktV24vBKJP3... | 2019-08-29T13:30:04Z | 215.0 | 11346 | 128 | 0 | 0 | True | [Music] [Music] so I think there's a lot of wo... |
| 6 | BigSLy3Kij0 | How to Tackle Tinnitus - Part 1 | This Morning | Dr Chris is on hand with all the latest inform... | This Morning | https://www.youtube.com/channel/UChFsYLqPUyLiZ... | 2017-08-21T06:34:17Z | 175.0 | 92407 | 469 | 4 | 0 | True | we're going to do this tinnitus item and we're... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 258 | o4GV-EbnMfI | Epley Maneuver to Treat BPPV Dizziness | Perform Dix-Hallpike maneuver first to determi... | Fauquier ENT | https://www.youtube.com/channel/UCdioHiFoafczK... | 2021-02-18T04:53:55Z | 150.0 | 3828024 | 25950 | 766 | 0 | True | Normally only with head movement \ndoes fluid ... |
| 261 | bh1X5POvBq8 | Tinnitus | A brief example of the Tinnitus sounds I exper... | Tao Boogie | https://www.youtube.com/channel/UCY1MBFvSN9utz... | 2016-08-03T15:04:16Z | 41.0 | 2145 | 26 | 3 | 0 | True | Transcript not available |
| 267 | eh0Xq9t-HcA | Chris Martin about his Tinnitus | For more Coldplay video's, please subscribe! :... | ColdplayDaily | https://www.youtube.com/channel/UC_nWJmRVJuJLY... | 2018-10-16T15:00:05Z | 105.0 | 292690 | 4248 | 419 | 0 | True | I know you've suffered from tinnitus mmm about... |
| 276 | Sye7NynVvfE | Surgery for TMJ Pain (headaches, earaches, pop... | To download this animation and many more, empl... | Nucleus Medical Media | https://www.youtube.com/channel/UC85VW73bQLEjs... | 2022-02-17T15:00:14Z | 191.0 | 346729 | 4835 | 376 | 0 | True | Transcript not available |
| 281 | hLeZ1rJuDJo | Tinnitus relief in 10 seconds | Tinnitus #hearing,#acupuncture I wish every ti... | Nick Chitty | https://www.youtube.com/channel/UC_z7YjP02dMNc... | 2019-02-08T14:15:32Z | 83.0 | 58349 | 466 | 139 | 0 | True | hi guys it's Nick I just wanted to share with ... |
100 rows × 13 columns
In [5]:
# Calculate the number of rows and columns
num_rows, num_columns = final_df.shape
# Display the results
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")
Number of rows: 100 Number of columns: 13
In [6]:
# Define a function to classify the type of content
def classify_content(title, description, transcript):
content = f"{title} {description} {transcript}".lower() # Combine all text fields
if "treatment" in content:
return "Treatment"
elif "relief" in content:
return "Relief"
elif "cure" in content:
return "Cure"
elif "cause" in content or "causes" in content:
return "Causes"
elif "therapy" in content:
return "Therapy"
elif "exercise" in content or "exercises" in content:
return "Exercises"
elif "stress" in content:
return "Stress"
else:
return "General Information"
# Apply the classification function to each video
final_df["content_type"] = final_df.apply(
lambda row: classify_content(row["title"], row["description"], row["transcript"]), axis=1
)
# Count the occurrences of each content type
content_counts = final_df["content_type"].value_counts()
# Display the results
print("Most Common Types of Tinnitus-Related Content in YouTube Videos Aimed at UK Audiences:")
print(content_counts)
# Plot the distribution of content types
plt.figure(figsize=(8, 5))
content_counts.plot(kind="bar", color="steelblue")
plt.title("Common Tinnitus-Related Content Types in UK YouTube Videos")
plt.xlabel("Content Type")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Most Common Types of Tinnitus-Related Content in YouTube Videos Aimed at UK Audiences: content_type Causes 37 General Information 26 Treatment 19 Relief 9 Cure 6 Stress 2 Therapy 1 Name: count, dtype: int64
In [7]:
# Define a function to classify the scientific grounding of the content
def classify_accuracy(title, description, transcript):
content = f"{title} {description} {transcript}".lower() # Combine all text fields
if "nhs" in content or "doctor" in content or "research" in content or "study" in content or "clinical" in content or "evidence-based" in content:
return "High Accuracy"
elif "therapy" in content or "relief" in content or "treatment" in content:
return "Medium Accuracy"
elif "cure" in content or "natural" in content or "alternative" in content or "holistic" in content:
return "Low Accuracy"
# Apply the classification function to each video
final_df["accuracy"] = final_df.apply(
lambda row: classify_accuracy(row["title"], row["description"], row["transcript"]), axis=1
)
# Plot the accuracy distribution as a violin plot
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 6))
sns.violinplot(x="accuracy", y="views", data=final_df, order=["High Accuracy", "Medium Accuracy", "Low Accuracy"], palette="Set2")
plt.title("Violin Plot: Views Distribution by Accuracy Level")
plt.xlabel("Accuracy Level")
plt.ylabel("Views")
plt.tight_layout()
plt.show()
C:\Users\Dell\AppData\Local\Temp\ipykernel_16132\1579571597.py:21: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(x="accuracy", y="views", data=final_df, order=["High Accuracy", "Medium Accuracy", "Low Accuracy"], palette="Set2")
In [12]:
# Define a function to classify the type of source
def classify_source(channel_name, description, transcript):
content = f"{channel_name} {description} {transcript}".lower() # Combine all text fields
if "doctor" in content or "clinic" in content or "healthcare" in content or "medical" in content:
return "Healthcare Professional"
elif "experience" in content or "my journey" in content or "personal story" in content:
return "Tinnitus Sufferer"
elif "alternative" in content or "natural cure" in content or "holistic" in content:
return "Alternative Therapy Promoter"
else:
return "General or Other"
# Apply the classification function to each video
final_df["source_type"] = final_df.apply(
lambda row: classify_source(row["channel_name"], row["description"], row["transcript"]), axis=1
)
# Count the occurrences of each source type
source_counts = final_df["source_type"].value_counts()
# Display the results in a pie chart
plt.figure(figsize=(8, 8))
source_counts.plot(
kind="pie",
autopct='%1.1f%%',
startangle=90,
colors=["#FF9999", "#66B3FF", "#99FF99", "#FFCC99"],
labels=source_counts.index
)
plt.title("Types of Sources Creating Tinnitus-Related YouTube Videos")
plt.ylabel("") # Remove default y-axis label
plt.show()
In [13]:
# Load the BERT sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
# Function to analyze tone using BERT
def analyze_tone(transcript):
if not isinstance(transcript, str) or transcript.strip() == "":
return "No Transcript", 0.0
# Perform sentiment analysis using BERT
result = sentiment_analyzer(transcript[:512]) # Limit to the first 512 tokens
sentiment = result[0]['label']
score = result[0]['score']
# Classify tone based on sentiment and confidence
if score < 0.7:
return "Neutral", score
elif sentiment == "POSITIVE":
return "Positive", score
else:
return "Negative", score
# Function to classify the approach
def classify_approach(title, description, transcript):
content = f"{title} {description} {transcript}".lower() # Combine all text fields
if "doctor" in content or "nhs" in content or "research" in content or "evidence-based" in content:
return "Scientific/Professional"
elif "experience" in content or "story" in content or "journey" in content:
return "Personal/Experiential"
elif "natural" in content or "alternative" in content or "cure" in content:
return "Alternative/Promotional"
else:
return "Other"
# Apply tone and approach analysis to the dataset
final_df["tone"], final_df["tone_score"] = zip(*final_df["transcript"].apply(analyze_tone))
final_df["approach"] = final_df.apply(
lambda row: classify_approach(row["title"], row["description"], row["transcript"]), axis=1
)
# Summarize tone distribution
tone_counts = final_df["tone"].value_counts().reset_index()
tone_counts.columns = ["Tone", "Count"]
# Summarize approach distribution
approach_counts = final_df["approach"].value_counts().reset_index()
approach_counts.columns = ["Approach", "Count"]
# Plotly: Tone Distribution
fig_tone = px.bar(
tone_counts,
x="Tone",
y="Count",
title="Tone Distribution of Videos",
color="Tone",
text="Count",
template="plotly"
)
fig_tone.update_traces(textposition="outside")
fig_tone.show()
# Plotly: Approach Distribution
fig_approach = px.pie(
approach_counts,
names="Approach",
values="Count",
title="Approach Distribution of Videos",
hole=0.4, # Donut chart
color_discrete_sequence=px.colors.qualitative.Set3
)
fig_approach.show()
# Display the first few rows of analysis
sample_analysis = final_df[["title", "tone", "tone_score", "approach"]].head()
# Create a table using Plotly graph_objects
fig_table = go.Figure(
data=[
go.Table(
header=dict(
values=["<b>Title</b>", "<b>Tone</b>", "<b>Tone Score</b>", "<b>Approach</b>"],
fill_color="lightblue",
align="left",
font=dict(color="black", size=12)
),
cells=dict(
values=[
sample_analysis["title"],
sample_analysis["tone"],
sample_analysis["tone_score"],
sample_analysis["approach"],
],
fill_color="white",
align="left",
font=dict(color="black", size=11)
)
)
]
)
fig_table.update_layout(title="Sample Analysis of Videos", title_x=0.5)
fig_table.show()
WARNING:tensorflow:From C:\Users\Dell\anaconda3\Lib\site-packages\tf_keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.
In [14]:
# Group videos by accuracy and calculate engagement metrics
engagement_summary = final_df.groupby("accuracy")[["views", "likes", "comments"]].mean()
# Display the engagement summary
print("Average Engagement Metrics by Accuracy Level:")
print(engagement_summary)
Average Engagement Metrics by Accuracy Level:
views likes comments
accuracy
High Accuracy 59972.483871 1257.870968 133.322581
Low Accuracy 246117.571429 2234.714286 419.714286
Medium Accuracy 162775.208333 2641.083333 425.166667
In [15]:
# Count tone and approach distribution
tone_counts = final_df["tone"].value_counts()
approach_counts = final_df["approach"].value_counts()
# Display tone and approach counts
print("\nTone Distribution:")
print(tone_counts)
print("\nApproach Distribution:")
print(approach_counts)
Tone Distribution: tone Negative 66 Positive 34 Name: count, dtype: int64 Approach Distribution: approach Other 52 Scientific/Professional 23 Personal/Experiential 19 Alternative/Promotional 6 Name: count, dtype: int64
In [16]:
# Correlate accuracy and engagement
correlation_summary = final_df.groupby("accuracy")[["views", "likes", "comments"]].sum()
print("\nEngagement Summary by Accuracy Level:")
print(correlation_summary)
# Display the correlation visually (heatmap)
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_summary, annot=True, fmt=".0f", cmap="Blues")
plt.title("Engagement Metrics by Accuracy Level")
plt.xlabel("Metrics")
plt.ylabel("Accuracy Level")
plt.tight_layout()
plt.show()
Engagement Summary by Accuracy Level:
views likes comments
accuracy
High Accuracy 1859147 38994 4133
Low Accuracy 1722823 15643 2938
Medium Accuracy 3906605 63386 10204
In [17]:
# Function to fetch comments for a video
def fetch_comments(video_id):
comments = []
try:
# Check if comments are enabled by fetching statistics
video_request = youtube.videos().list(part="statistics", id=video_id)
video_response = video_request.execute()
comment_count = int(video_response["items"][0]["statistics"].get("commentCount", 0))
if comment_count == 0:
return ["Comments Disabled"]
# Fetch comments if they are enabled
request = youtube.commentThreads().list(
part="snippet",
videoId=video_id,
maxResults=50 # Adjust as needed
)
response = request.execute()
# Extract comment text
for item in response.get("items", []):
comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
comments.append(comment)
except Exception as e:
print(f"Error fetching comments for video {video_id}: {e}")
return ["Error Fetching Comments"]
return comments
# Example usage: Fetch comments for the first video
final_df["comments_text"] = final_df["video_id"].apply(fetch_comments)
In [18]:
# Sentiment analysis function for comments
def analyze_comment_sentiment(comment_list):
if not isinstance(comment_list, list) or len(comment_list) == 0:
return "No Sentiment"
if "Comments Disabled" in comment_list or "Error Fetching Comments" in comment_list:
return "No Sentiment"
sentiments = [sentiment_analyzer(comment[:512])[0]["label"] for comment in comment_list]
# Return the most frequent sentiment
return max(set(sentiments), key=sentiments.count)
# Apply sentiment analysis on the fetched comments
final_df["comment_sentiment"] = final_df["comments_text"].apply(analyze_comment_sentiment)
# Summarize sentiment distribution
sentiment_counts = final_df["comment_sentiment"].value_counts()
print("\nComment Sentiment Distribution:")
print(sentiment_counts)
Comment Sentiment Distribution: comment_sentiment No Sentiment 48 NEGATIVE 45 POSITIVE 7 Name: count, dtype: int64
In [ ]: